suppressMessages(
suppressWarnings(
c(library(org.Mm.eg.db),
library(AnnotationDbi),
library(data.table),
library(tidyverse),
library(rtracklayer),
library(RColorBrewer),
library(gplots),
library(clusterProfiler),
library(fgsea))
)
)
## [1] "org.Mm.eg.db" "AnnotationDbi" "IRanges" "S4Vectors"
## [5] "Biobase" "BiocGenerics" "parallel" "stats4"
## [9] "stats" "graphics" "grDevices" "utils"
## [13] "datasets" "methods" "base" "org.Mm.eg.db"
## [17] "AnnotationDbi" "IRanges" "S4Vectors" "Biobase"
## [21] "BiocGenerics" "parallel" "stats4" "stats"
## [25] "graphics" "grDevices" "utils" "datasets"
## [29] "methods" "base" "data.table" "org.Mm.eg.db"
## [33] "AnnotationDbi" "IRanges" "S4Vectors" "Biobase"
## [37] "BiocGenerics" "parallel" "stats4" "stats"
## [41] "graphics" "grDevices" "utils" "datasets"
## [45] "methods" "base" "forcats" "stringr"
## [49] "dplyr" "purrr" "readr" "tidyr"
## [53] "tibble" "ggplot2" "tidyverse" "data.table"
## [57] "org.Mm.eg.db" "AnnotationDbi" "IRanges" "S4Vectors"
## [61] "Biobase" "BiocGenerics" "parallel" "stats4"
## [65] "stats" "graphics" "grDevices" "utils"
## [69] "datasets" "methods" "base" "rtracklayer"
## [73] "GenomicRanges" "GenomeInfoDb" "forcats" "stringr"
## [77] "dplyr" "purrr" "readr" "tidyr"
## [81] "tibble" "ggplot2" "tidyverse" "data.table"
## [85] "org.Mm.eg.db" "AnnotationDbi" "IRanges" "S4Vectors"
## [89] "Biobase" "BiocGenerics" "parallel" "stats4"
## [93] "stats" "graphics" "grDevices" "utils"
## [97] "datasets" "methods" "base" "RColorBrewer"
## [101] "rtracklayer" "GenomicRanges" "GenomeInfoDb" "forcats"
## [105] "stringr" "dplyr" "purrr" "readr"
## [109] "tidyr" "tibble" "ggplot2" "tidyverse"
## [113] "data.table" "org.Mm.eg.db" "AnnotationDbi" "IRanges"
## [117] "S4Vectors" "Biobase" "BiocGenerics" "parallel"
## [121] "stats4" "stats" "graphics" "grDevices"
## [125] "utils" "datasets" "methods" "base"
## [129] "gplots" "RColorBrewer" "rtracklayer" "GenomicRanges"
## [133] "GenomeInfoDb" "forcats" "stringr" "dplyr"
## [137] "purrr" "readr" "tidyr" "tibble"
## [141] "ggplot2" "tidyverse" "data.table" "org.Mm.eg.db"
## [145] "AnnotationDbi" "IRanges" "S4Vectors" "Biobase"
## [149] "BiocGenerics" "parallel" "stats4" "stats"
## [153] "graphics" "grDevices" "utils" "datasets"
## [157] "methods" "base" "clusterProfiler" "gplots"
## [161] "RColorBrewer" "rtracklayer" "GenomicRanges" "GenomeInfoDb"
## [165] "forcats" "stringr" "dplyr" "purrr"
## [169] "readr" "tidyr" "tibble" "ggplot2"
## [173] "tidyverse" "data.table" "org.Mm.eg.db" "AnnotationDbi"
## [177] "IRanges" "S4Vectors" "Biobase" "BiocGenerics"
## [181] "parallel" "stats4" "stats" "graphics"
## [185] "grDevices" "utils" "datasets" "methods"
## [189] "base" "fgsea" "Rcpp" "clusterProfiler"
## [193] "gplots" "RColorBrewer" "rtracklayer" "GenomicRanges"
## [197] "GenomeInfoDb" "forcats" "stringr" "dplyr"
## [201] "purrr" "readr" "tidyr" "tibble"
## [205] "ggplot2" "tidyverse" "data.table" "org.Mm.eg.db"
## [209] "AnnotationDbi" "IRanges" "S4Vectors" "Biobase"
## [213] "BiocGenerics" "parallel" "stats4" "stats"
## [217] "graphics" "grDevices" "utils" "datasets"
## [221] "methods" "base"
# lead the miRNA list with peaks associated
mirs.peaks <- readRDS("Datafiles/miRNA-peaks-list-09282019-withIDs.rds")
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
Run KEGG analysis with the whole universe as background.
KEGG.list <- list()
mirname <- c()
for (i in 1:20) {
sig.genes <- mirs.peaks[[i]]$target_Entrez_ID
kk <- enrichKEGG(gene = sig.genes, organism = 'mmu')
enriched.number <- dim(kk)[1]
print(paste("Number of KEGG pathway enriched for", mirna[i], ":", enriched.number))
if (enriched.number > 0) {
KEGG.list <- c(KEGG.list, list(as.data.frame(kk)))
d <- dotplot(kk, showCategory = 20) + labs(title = paste("Enriched KEGG pathways for", mirna[i], "targets"))
print(d)
mirname <- c(mirname,mirna[i])
write.csv(as.data.frame(kk), paste("KEGG Analysis/", str_remove_all(mirna[i], "/"), "_KEGG.csv", sep = ""))
}
}
## [1] "Number of KEGG pathway enriched for let-7-5p/miR-98-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-29-3p : 16"
## [1] "Number of KEGG pathway enriched for miR-15-5p/16-5p/195-5p/322-5p/497-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-17-5p/20-5p/93-5p/106-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-200-3p/429-3p : 56"
## [1] "Number of KEGG pathway enriched for miR-24-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-194-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-27-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-103-3p/107-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-23-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-196-5p : 1"
## [1] "Number of KEGG pathway enriched for miR-19-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-26-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-25-3p/32-5p/92-3p/363-3p/367-3p : 8"
## [1] "Number of KEGG pathway enriched for miR-21-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-141-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-484 : 0"
## [1] "Number of KEGG pathway enriched for miR-30-5p/384-5p : 0"
## [1] "Number of KEGG pathway enriched for miR-130-3p/301-3p : 0"
## [1] "Number of KEGG pathway enriched for miR-22-3p : 1"
names(KEGG.list) <- mirname
saveRDS(KEGG.list, "Datafiles/peak.KEGG.result.rds")
I would like to do GSEA analysis on KEGG pathways for each miRNA’s associated peaks.
# load the proteomics datafile
protein_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/colon tumor-enema model/crcMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## `Protein Id` = col_character(),
## Description = col_character(),
## p_values = col_double(),
## q_values = col_double(),
## foldChange = col_double(),
## LFC = col_double()
## )
# annotate the proteomics dataset with Entrez ID
# Entrez IDs are annotated using `org.Mm.eg.db` package since this is the most updated
annotations_entrez <- AnnotationDbi::select(org.Mm.eg.db,
keys = as.character(protein_DGE$`Protein Id`),
columns = c("ENTREZID"),
keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_entrez$ENTREZID) == FALSE)
# Return only the non-duplicated genes using indices
annotations_entrez <- annotations_entrez[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_entrez$ENTREZID) %>%
which() %>%
length()
## [1] 1
# annotate the dataset with Entrez ID
protein_DGE$Entrez_ID <- NA
for (i in 1:dim(protein_DGE)[1]) {
index <- grep(protein_DGE$`Protein Id`[i], annotations_entrez$UNIPROT)
if (length(index)>0) {
protein_DGE$Entrez_ID[i] <- paste(annotations_entrez$ENTREZID[index], collapse = " ")
}
}
# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_H_v5p2.rdata")
pathways <- Mm.H
# GSEA on Hallmark gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
gseadata <- mirs.peaks[[i]]$target_Entrez_ID
gseadata <- gseadata[!is.na(gseadata)]
ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
ranks <- sort(ranks, decreasing = T)
fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
if (dim(fgseaRes)[1] > 0) {
GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
mirname <- c(mirname,mirna[i])
write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_Hallmark.csv", sep = ""))
for (n in 1:length(fgseaRes$pathway)) {
p <- plotEnrichment(pathways[[fgseaRes$pathway[n]]], ranks) + labs(title = paste(mirna[i],"-",fgseaRes$pathway[n], sep = ""))
print(p)
}
plotGseaTable(pathways[fgseaRes$pathway], ranks, fgseaRes, gseaParam = 0.5)
}
}
names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.Hallmark.result.rds")
# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_c2_v5p2.rdata")
pathways <- Mm.c2
# GSEA on C2 gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
gseadata <- mirs.peaks[[i]]$target_Entrez_ID
gseadata <- gseadata[!is.na(gseadata)]
ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
ranks <- sort(ranks, decreasing = T)
fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
if (dim(fgseaRes)[1] > 0) {
GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
mirname <- c(mirname,mirna[i])
write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_C2.csv", sep = ""))
topPathwaysUp <- fgseaRes[ES > 0][head(order(padj), n=10), pathway]
topPathwaysDown <- fgseaRes[ES < 0][head(order(padj), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
print(mirna[i])
plotGseaTable(pathways[topPathways], ranks, fgseaRes, gseaParam = 0.5)
}
}
## [1] "let-7-5p/miR-98-5p"
## [1] "miR-29-3p"
## [1] "miR-103-3p/107-3p"
## [1] "miR-196-5p"
## [1] "miR-26-5p"
## [1] "miR-25-3p/32-5p/92-3p/363-3p/367-3p"
## [1] "miR-21-3p"
names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.C2.result.rds")
# load the geneset data
load("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/GSEA Analysis/mouse_c3_v5p2.rdata")
pathways <- Mm.c3
# GSEA on C2 gene set for the top 20 miRNAs
GSEA.list <- list()
mirname <- c()
plist <- c()
for (i in 1:20) {
gseadata <- mirs.peaks[[i]]$target_Entrez_ID
gseadata <- gseadata[!is.na(gseadata)]
ranks <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$LFC
names(ranks) <- protein_DGE[protein_DGE$Entrez_ID %in% gseadata, ]$Entrez_ID
ranks <- sort(ranks, decreasing = T)
fgseaRes <- fgsea(pathways, ranks, minSize=15, maxSize = 500, nperm=1000)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.1,]
suppressMessages(fgseaRes[, leadingEdge := lapply(leadingEdge, mapIds, x=org.Mm.eg.db, keytype="ENTREZID", column="SYMBOL")])
fgseaRes$leadingEdge <- as.character(fgseaRes$leadingEdge)
if (dim(fgseaRes)[1] > 0) {
GSEA.list <- c(GSEA.list, list(as.data.frame(fgseaRes)))
mirname <- c(mirname,mirna[i])
write.csv(as.data.frame(fgseaRes), paste("GSEA Analysis/", str_remove_all(mirna[i], "/"), "_C3.csv", sep = ""))
topPathwaysUp <- fgseaRes[ES > 0][head(order(padj), n=10), pathway]
topPathwaysDown <- fgseaRes[ES < 0][head(order(padj), n=10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
print(mirna[i])
plotGseaTable(pathways[topPathways], ranks, fgseaRes, gseaParam = 0.5)
}
}
## [1] "let-7-5p/miR-98-5p"
## [1] "miR-29-3p"
## [1] "miR-200-3p/429-3p"
## [1] "miR-27-3p"
## [1] "miR-196-5p"
## [1] "miR-26-5p"
## [1] "miR-25-3p/32-5p/92-3p/363-3p/367-3p"
## [1] "miR-22-3p"
names(GSEA.list) <- mirname
saveRDS(GSEA.list, "Datafiles/peak.GSEA.C3.result.rds")
# compile a matrix containing all genesets that had enrichment for all 20 miRNAs and their enrichment score
hall.GSEA <- readRDS("Datafiles/peak.GSEA.Hallmark.result.rds")
c2.GSEA <- readRDS("Datafiles/peak.GSEA.C2.result.rds")
c3.GSEA <- readRDS("Datafiles/peak.GSEA.C3.result.rds")
gsea.matrix <- data.frame(c(1:20))
gsea.matrix <- t(gsea.matrix)
colnames(gsea.matrix) <- mirna[1:20]
gsea.matrix[1,] <- NA
for (i in 1:length(hall.GSEA)) {
for (n in 1:length(hall.GSEA[[i]]$pathway)) {
if (length(grep(hall.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
col.index <- grep(names(hall.GSEA[i]), colnames(gsea.matrix))
row.index <- grep(hall.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
gsea.matrix[row.index, col.index] <- hall.GSEA[[i]]$ES[n]
}
else {
gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- hall.GSEA[[i]]$pathway[n]
col.index <- grep(names(hall.GSEA[i]), colnames(gsea.matrix))
gsea.matrix[dim(gsea.matrix)[1], col.index] <- hall.GSEA[[i]]$ES[n]
}
}
}
gsea.matrix <- gsea.matrix[-1,]
for (i in 1:length(c2.GSEA)) {
for (n in 1:length(c2.GSEA[[i]]$pathway)) {
if (length(grep(c2.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
col.index <- grep(names(c2.GSEA[i]), colnames(gsea.matrix))
row.index <- grep(c2.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
gsea.matrix[row.index, col.index] <- c2.GSEA[[i]]$ES[n]
}
else {
gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- c2.GSEA[[i]]$pathway[n]
col.index <- grep(names(c2.GSEA[i]), colnames(gsea.matrix))
gsea.matrix[dim(gsea.matrix)[1], col.index] <- c2.GSEA[[i]]$ES[n]
}
}
}
for (i in 1:length(c3.GSEA)) {
for (n in 1:length(c3.GSEA[[i]]$pathway)) {
if (length(grep(c3.GSEA[[i]]$pathway[n], rownames(gsea.matrix))) > 0) {
col.index <- grep(names(c3.GSEA[i]), colnames(gsea.matrix))
row.index <- grep(c3.GSEA[[i]]$pathway[n], rownames(gsea.matrix))
gsea.matrix[row.index, col.index] <- c3.GSEA[[i]]$ES[n]
}
else {
gsea.matrix <- rbind(gsea.matrix, rep(NA,20))
rownames(gsea.matrix)[dim(gsea.matrix)[1]] <- c3.GSEA[[i]]$pathway[n]
col.index <- grep(names(c3.GSEA[i]), colnames(gsea.matrix))
gsea.matrix[dim(gsea.matrix)[1], col.index] <- c3.GSEA[[i]]$ES[n]
}
}
}
gsea.matrix <- as.matrix(gsea.matrix)
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
na_color <- colorRampPalette("grey")
png('GSEA Analysis/GSEA_HEAP-CLIP_peaks.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
par(cex.main=1.8)
heatmap.2(gsea.matrix,
main = "GSEA for 20 most active miRNAs from HEAP-CLIP",
density.info = "none",
key = TRUE,
lwid = c(1,7),
lhei = c(1,7),
col=my_palette,
labRow = FALSE,
margins = c(20,7),
symbreaks = TRUE,
trace = "none",
Rowv=FALSE,
Colv=FALSE,
dendrogram = "none",
na.color = "grey",
cexCol = 1,
na.rm = FALSE)
dev.off()
## quartz_off_screen
## 2
GSEA